# Merges homes in ZTRAX and DIN based on Assessor Parcel Numbers.
pacman::p_load(stringdist, data.table, fst, tidyverse)

rm(list = ls())

source("code/globals.R")

dins <- readRDS(file.path(WORKING, "dins_records.Rds"))
dins <- data.table(dins)

# Late 2016 and 2017 incidents
dins17 <- dins[dins$date >= as.Date("2016-09-01") &
  dins$date < as.Date("2018-10-01"), ]
# 2018 and later incidents
dins18 <- dins[dins$date >= as.Date("2018-10-01") &
  dins$date < as.Date("2020-12-31"), ]

# Ztrax tables of various vintages
attr16 <- read_fst(file.path(WORKING, "ztraxdata/CAattr2016.fst"), as.data.table = T)
attr18 <- read_fst(file.path(WORKING, "ztraxdata/CAattr2018.fst"), as.data.table = T)

## Confirm unique IDs in all datasets
anyDuplicated(dins17[, c("FIPS", "UnformattedAssessorParcelNumber", "INCIDENTNUM")])
anyDuplicated(dins18[, c("FIPS", "UnformattedAssessorParcelNumber", "INCIDENTNUM")])
anyDuplicated(attr16[, c("ImportParcelID", "BuildingOrImprovementNumber")])
anyDuplicated(attr18[, c("ImportParcelID", "BuildingOrImprovementNumber")])

############## hand inspection of county-specific merge rates to reverse engineer each county's APN formatting rules. ################################

### Strategy: Use string matching on address to identify high-quality matches on address field; then compare APN formatting for those records.

## Function to inspect IDs by county in each dataset. Looks for approx string matches on address; then use that to compare APNs
inspectAPN <- function(targetfips, dinsfile, attrfile) {
  # targetfips <- 6005
  # dinsfile <- dins17
  # attrfile <- attr16
  
  # END DEBUG
  

  
  dins <- dinsfile
  attr <- attrfile
  dd <- dins[dins$FIPS == targetfips, c(
    "APN", "UnformattedAssessorParcelNumber", "STREETNUMBER",
    "STREETNAME", "STREETTYPE", "STREETSUFFIX", "CITY", "STRUCTURETYPE",
    "howmanystructures", "badrecordflag"
  )]
  dd <- dd[order(dd$STREETNAME, dd$STREETNUMBER), ]
  dd$address <- tolower(paste(dd$STREETNUMBER, dd$STREETNAME, dd$STREETTYPE, sep = " "))
  aa <- attr[attr$FIPS == targetfips, c(
    "BuildingOrImprovementNumber", "AssessorParcelNumber",
    "UnformattedAssessorParcelNumber", "PropertyFullStreetAddress",
    "NoOfBuildings", "NoOfUnits", "PropertyLandUseStndCode", "PropertyStreetName", "PropertyHouseNumber", "PropertyStreetSuffix"
  )]
  aa <- aa[order(aa$PropertyStreetName, aa$PropertyHouseNumber), ]
  aa$address <- tolower(paste(aa$PropertyHouseNumber, aa$PropertyStreetName, aa$PropertyStreetSuffix))
  names(dd)[names(dd) == "UnformattedAssessorParcelNumber"] <- "UnformattedAPN_DINS"
  names(dd)[names(dd) == "APN"] <- "APN_DINS"
  names(dd)[names(dd) == "address"] <- "address_DINS"
  ## For 100 random fire records, find closest address string in ZTrax
  set.seed(16473)
  dd$rand <- runif(nrow(dd))
  dd <- dd[order(dd$rand), ]
  if (nrow(dd) > 100) {
    dd <- dd[1:100, ]
  }
  table <- data.frame(matrix(nrow = nrow(dd), ncol = 10, data = NA))
  ## Going house by house in the DINS sample, subset Ztrax to only records with identical house NUMBER, then find closest string for full address.
  for (i in seq(1, nrow(dd))) {
    exacthousenumbermatches <- aa[aa$PropertyHouseNumber == dd[i, STREETNUMBER]]
    bestmatch <- amatch(dd[i, address_DINS], exacthousenumbermatches$address, maxDist = 100, method = "dl")
    tablerow <- cbind(
      dd[i, c("UnformattedAPN_DINS", "APN_DINS", "address_DINS", "STRUCTURETYPE", "howmanystructures", "badrecordflag")],
      exacthousenumbermatches[bestmatch, c("PropertyFullStreetAddress", "UnformattedAssessorParcelNumber", "AssessorParcelNumber")]
    )
    tablerow$stringdist <- stringdist(tolower(tablerow$address_DINS), tolower(tablerow$PropertyFullStreetAddress), method = "dl")
    table[i, ] <- tablerow
  }
  names(table) <- names(tablerow)
  table <- table[, c(
    "stringdist", "STRUCTURETYPE", "UnformattedAssessorParcelNumber", "UnformattedAPN_DINS",
    "AssessorParcelNumber", "APN_DINS", "PropertyFullStreetAddress",
    "address_DINS", "howmanystructures", "badrecordflag"
  )]
  table <- table[order(table$stringdist), ]
  table$lengthUAPN_DINS <- nchar(table$UnformattedAPN_DINS)
  table$lengthUAPN_Ztrax <- nchar(table$UnformattedAssessorParcelNumber)
  return(table)
}

## Function to check how many string address matches are perfectly match on UAPN
checkAPN <- function(targetfips, dinsfile, attrfile) {
  v <- inspectAPN(targetfips, dinsfile, attrfile)
  v <- v[v$stringdist <= 10 & !is.na(v$stringdist), ]
  print(nrow(v[v$UnformattedAssessorParcelNumber == v$UnformattedAPN_DINS, ]) / nrow(v))
  v <- v[v$stringdist <= 8 & !is.na(v$stringdist), ]
  print(nrow(v[v$UnformattedAssessorParcelNumber == v$UnformattedAPN_DINS, ]) / nrow(v))
}


###### Apply this function to each county and make notes on required changes

# I separately reformat the APNs in the two datasets to match different APN formatting in the different vintages of ZTRAX data.

#### Some basic operations for reformatting APN numbers.
# 1. Replace dashes.
nodash <- function(dinsfile, fips) {
  dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips] <-
    gsub("-", "", dinsfile$UnformattedAssessorParcelNumber, fixed = TRUE)[dinsfile$FIPS == fips]
  return(dinsfile)
}

# 2. Substring
usesubset <- function(dinsfile, fips, start, end) {
  dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips] <-
    substr(dinsfile$UnformattedAssessorParcelNumber, start, end)[dinsfile$FIPS == fips]
  return(dinsfile)
}

# 3. Pre-append a zero
pre0 <- function(dinsfile, fips) {
  dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips] <-
    paste0("0", dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips])
  return(dinsfile)
}

# 4. Post-append a zero
post0 <- function(dinsfile, fips) {
  dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips] <-
    paste0(dinsfile$UnformattedAssessorParcelNumber[dinsfile$FIPS == fips], "0")
  return(dinsfile)
}


### 2017 INCIDENTS CLEANUP

b <- dins17[, .N, by = FIPS]
b <- b[order(as.numeric(b$FIPS)), ]
b$FIPS


# 6005
inspectAPN(6005, dins17, attr16)
dins17 <- usesubset(dins17, 6005, 1, 9)
checkAPN(6005, dins17, attr16)

# 6007
inspectAPN(6007, dins17, attr16)
dins17 <- nodash(dins17, 6007)
dins17 <- usesubset(dins17, 6007, 1, 9)
checkAPN(6007, dins17, attr16)

# 6009 #Poor match, single home record, mobile home. Come back and drop at end.
inspectAPN(6009, dins17, attr16)

# 6013   THIS MANUAL FIX IS FROM WES'S CODE
inspectAPN(6013, dins17, attr16)
dins17$UnformattedAssessorParcelNumber[dins17$FIPS == 6013 &
  dins17$UnformattedAssessorParcelNumber == "001041002"] <- "0010410025"
dins17$UnformattedAssessorParcelNumber[dins17$FIPS == 6013 &
  dins17$UnformattedAssessorParcelNumber == "078190014"] <- "0781900147"

# 6017
inspectAPN(6017, dins17, attr16)
dins17$UnformattedAssessorParcelNumber[dins17$FIPS == 6017] <-
  paste0(dins17$UnformattedAssessorParcelNumber[dins17$FIPS == 6017], "100")
checkAPN(6017, dins17, attr16)

# 6019 #No changes needed
inspectAPN(6019, dins17, attr16)
checkAPN(6019, dins17, attr16)

# 6023 #Poor merge here; only 2 records. Come back and drop at end.
inspectAPN(6023, dins17, attr16)
dins17[dins17$FIPS == 6023, ]

# 6029 # No changes needed
inspectAPN(6029, dins17, attr16)
checkAPN(6029, dins17, attr16)

# 6031 #Poor merge, no address information in DINS, 1 record only. Come back, drop.
inspectAPN(6031, dins17, attr16)

# 6033 # No changes needed
inspectAPN(6033, dins17, attr16)
checkAPN(6033, dins17, attr16)

# 6035 #Poor match, 3 records, 2 of which are mobile homes. Come back, drop.
inspectAPN(6035, dins17, attr16)

# 6037 #No changes needed
inspectAPN(6037, dins17, attr16)
checkAPN(6037, dins17, attr16)

# 6039
inspectAPN(6039, dins17, attr16)
dins17 <- nodash(dins17, 6039)
checkAPN(6039, dins17, attr16)

# 6043
inspectAPN(6043, dins17, attr16)
dins17 <- nodash(dins17, 6043)
checkAPN(6043, dins17, attr16)

# 6045 #No changes needed
inspectAPN(6045, dins17, attr16)
checkAPN(6045, dins17, attr16)

# 6051 #No changes needed
inspectAPN(6051, dins17, attr16)
checkAPN(6051, dins17, attr16)

# 6055
inspectAPN(6055, dins17, attr16)
dins17 <- usesubset(dins17, 6055, 1, 9)
checkAPN(6055, dins17, attr16)

# 6057
inspectAPN(6057, dins17, attr16)
dins17 <- usesubset(dins17, 6057, 1, 7)
checkAPN(6057, dins17, attr16)

# 6059
inspectAPN(6059, dins17, attr16)
dins17 <- nodash(dins17, 6059)
checkAPN(6059, dins17, attr16)

# 6061
inspectAPN(6061, dins17, attr16)
dins17 <- nodash(dins17, 6061)
dins17 <- usesubset(dins17, 6061, 1, 9)
checkAPN(6061, dins17, attr16)

# 6065 #No changes needed
inspectAPN(6065, dins17, attr16)
checkAPN(6065, dins17, attr16)

# 6073 #No changes needed
inspectAPN(6073, dins17, attr16)
checkAPN(6073, dins17, attr16)

# 6079
inspectAPN(6079, dins17, attr16)
dins17 <- nodash(dins17, 6079)
checkAPN(6079, dins17, attr16)

# 6083
inspectAPN(6083, dins17, attr16)
dins17 <- nodash(dins17, 6083)
checkAPN(6083, dins17, attr16)

# 6085 # Poor merge; many records have 0 for house number in DINs. Come back and drop. A late 2016 fire.
inspectAPN(6085, dins17, attr16)

# 6087
inspectAPN(6087, dins17, attr16)
dins17 <- nodash(dins17, 6087)
checkAPN(6087, dins17, attr16)

# 6089 #No change needed
inspectAPN(6089, dins17, attr16)
checkAPN(6089, dins17, attr16)

# 6093
inspectAPN(6093, dins17, attr16)
dins17 <- nodash(dins17, 6093)
checkAPN(6093, dins17, attr16)

# 6095 #No changes needed
inspectAPN(6095, dins17, attr16)
checkAPN(6095, dins17, attr16)

# 6097
inspectAPN(6097, dins17, attr16)
dins17 <- nodash(dins17, 6097)
checkAPN(6097, dins17, attr16)

# 6103 #Very poor match; only 4 records, 2 are mobile homes. Come back and drop.
inspectAPN(6103, dins17, attr16)

# 6105
inspectAPN(6105, dins17, attr16)
dins17 <- nodash(dins17, 6105)
dins17 <- usesubset(dins17, 6105, 1, 8)
checkAPN(6105, dins17, attr16)

# 6109 Poor match; 1 record only, mobile home. Come back and drop.
inspectAPN(6109, dins17, attr16)

# 6111   # REQUIRES CHANGES ON ZILLOW SIDE
inspectAPN(6111, dins17, attr16)

# 6113
inspectAPN(6113, dins17, attr16)
dins17$UnformattedAssessorParcelNumber[dins17$FIPS == 6113] <-
  gsub(" ", "", dins17$UnformattedAssessorParcelNumber, fixed = TRUE)[dins17$FIPS == 6113]
dins17$UnformattedAssessorParcelNumber[dins17$FIPS == 6113] <- paste0(
  substr(dins17$UnformattedAssessorParcelNumber, 1, 6)[dins17$FIPS == 6113],
  "0",
  substr(dins17$UnformattedAssessorParcelNumber, 7, 8)[dins17$FIPS == 6113],
  "000"
)
checkAPN(6113, dins17, attr16)

# 6115
inspectAPN(6115, dins17, attr16)
dins17 <- usesubset(dins17, 6115, 1, 9)
checkAPN(6115, dins17, attr16)


########### ZILLOW SIDE CHANGES FOR ATTR16

# 6111
# APNs in this county carry 1 extra digit in the FINAL position compared to the DINS APNs (often but not always a '5'). Trimming off
# this extra digit does not seem to sacrifice any information. The trimmed APNs no longer uniquely identify homes in Ztrax -- there are now 30ish duplicate records -- but
# importantly, none of these are homes that appear in DINS so this is still valid and usable as a merge ID.
inspectAPN(6111, dins17, attr16)
# confirm no loss of information when using this as a merge ID to DINS
g <- attr16[attr16$FIPS == 6111, c("AssessorParcelNumber", "UnformattedAssessorParcelNumber", "PropertyFullStreetAddress")]
g$trimmedapn <- substr(g$UnformattedAssessorParcelNumber, 1, 9)
gg <- g[duplicated(g$trimmedapn) | duplicated(g$trimmedapn, fromLast = TRUE), ]
gg[order(gg$trimmedapn), ]
sum(unique(gg$trimmedapn) %in% dins17$UnformattedAssessorParcelNumber[dins17$FIPS == 6111])
stopifnot(sum(unique(gg$trimmedapn) %in% dins17$UnformattedAssessorParcelNumber[dins17$FIPS == 6111]) == 0) # Confirm that none of the duplicates after trimming APN in ZTRAX are records that appear in DINS
# Make the change in ZTRAX
attr16$UnformattedAssessorParcelNumber[attr16$FIPS == 6111] <- substr(attr16$UnformattedAssessorParcelNumber, 1, 9)[attr16$FIPS == 6111]
rm(g, gg)



########## THEN FOR 2018 AND LATER INCIDENTS

b <- dins18[, .N, by = FIPS]
b <- b[order(as.numeric(b$FIPS)), ]
b$FIPS

# 6001:
## Poor quality merge
inspectAPN(6001, dins18, attr18)
dins18 <- nodash(dins18, 6001)
checkAPN(6001, dins18, attr18)

# 6007:
inspectAPN(6007, dins18, attr18)
dins18 <- nodash(dins18, 6007)
checkAPN(6007, dins18, attr18)

# 6017:  # One home and one mobile home; poor merge info; drop
inspectAPN(6017, dins18, attr18)

# 6019:  # No changes needed
inspectAPN(6019, dins18, attr18)
checkAPN(6019, dins18, attr18)

# 6021:
inspectAPN(6021, dins18, attr18)
dins18 <- post0(dins18, 6021)
dins18 <- post0(dins18, 6021)
checkAPN(6021, dins18, attr18)

# 6029 - Requires Zillow side changes, see below.
inspectAPN(6029, dins18, attr18)

# 6033 - Requires Zillow side changes, see below.
inspectAPN(6033, dins18, attr18)

# 6035
inspectAPN(6035, dins18, attr18)
dins18 <- nodash(dins18, 6035)
checkAPN(6035, dins18, attr18)

# 6037: No changes
inspectAPN(6037, dins18, attr18)
checkAPN(6037, dins18, attr18)

# 6039
inspectAPN(6039, dins18, attr18)
dins18 <- nodash(dins18, 6039)
dins18 <- post0(dins18, 6039)
dins18 <- post0(dins18, 6039)
dins18 <- post0(dins18, 6039)
checkAPN(6039, dins18, attr18)

# 6045
inspectAPN(6045, dins18, attr18)
dins18 <- post0(dins18, 6045)
dins18 <- post0(dins18, 6045)
checkAPN(6045, dins18, attr18)

# 6053 - No changes needed
inspectAPN(6053, dins18, attr18)
checkAPN(6053, dins18, attr18)

# 6055 - No changes needed
inspectAPN(6055, dins18, attr18)
checkAPN(6055, dins18, attr18)

# 6057 - No changes needed
inspectAPN(6057, dins18, attr18)
checkAPN(6057, dins18, attr18)

# 6059
inspectAPN(6059, dins18, attr18)
dins18 <- nodash(dins18, 6059)
checkAPN(6059, dins18, attr18)

# 6063
inspectAPN(6063, dins18, attr18)
dins18 <- post0(dins18, 6063)
dins18 <- post0(dins18, 6063)
dins18 <- post0(dins18, 6063)
checkAPN(6063, dins18, attr18)

# 6065 - No changes needed
inspectAPN(6065, dins18, attr18)
checkAPN(6065, dins18, attr18)

# 6069 - Poor merge, only 3 homes -- drop
inspectAPN(6069, dins18, attr18)

# 6071
inspectAPN(6071, dins18, attr18)
dins18 <- post0(dins18, 6071)
dins18 <- post0(dins18, 6071)
dins18 <- post0(dins18, 6071)
dins18 <- post0(dins18, 6071)
checkAPN(6071, dins18, attr18)

# 6073 - No changes needed
inspectAPN(6073, dins18, attr18)

# 6077
inspectAPN(6077, dins18, attr18)
dins18 <- post0(dins18, 6077)
dins18 <- post0(dins18, 6077)
dins18 <- post0(dins18, 6077)
dins18 <- post0(dins18, 6077)
checkAPN(6077, dins18, attr18)

# 6079
inspectAPN(6079, dins18, attr18)
dins18 <- nodash(dins18, 6079)
checkAPN(6079, dins18, attr18)

# 6081 - No changes needed; note poor address information for many DINS homes.
inspectAPN(6081, dins18, attr18)
checkAPN(6081, dins18, attr18)

# 6085 ## No obvious fixes, but very poor merge. 170 homes. Drop; could try to fix later because of large number of homes here.
inspectAPN(6085, dins18, attr18)
checkAPN(6085, dins18, attr18)

# 6087
inspectAPN(6087, dins18, attr18)
dins18 <- nodash(dins18, 6087)
checkAPN(6087, dins18, attr18)

# 6089 - No changes needed
inspectAPN(6089, dins18, attr18)
checkAPN(6089, dins18, attr18)

# 6093
inspectAPN(6093, dins18, attr18)
dins18 <- nodash(dins18, 6093)
dins18 <- post0(dins18, 6093)
dins18 <- post0(dins18, 6093)
dins18 <- post0(dins18, 6093)
checkAPN(6093, dins18, attr18)

# 6095 - No changes needed
inspectAPN(6095, dins18, attr18)
checkAPN(6095, dins18, attr18)

# 6097
inspectAPN(6097, dins18, attr18)
dins18 <- nodash(dins18, 6097)
dins18 <- post0(dins18, 6097)
dins18 <- post0(dins18, 6097)
dins18 <- post0(dins18, 6097)
checkAPN(6097, dins18, attr18)

# 6099
inspectAPN(6099, dins18, attr18)
dins18 <- post0(dins18, 6099)
dins18 <- post0(dins18, 6099)
dins18 <- post0(dins18, 6099)
checkAPN(6099, dins18, attr18)

# 6103 ## Poor merge information, only 9 houses. Drop it.
inspectAPN(6103, dins18, attr18)

# 6105
inspectAPN(6105, dins18, attr18)
dins18 <- nodash(dins18, 6105)
checkAPN(6105, dins18, attr18)

# 6107
inspectAPN(6107, dins18, attr18)
dins18 <- nodash(dins18, 6107)
dins18 <- post0(dins18, 6107)
dins18 <- post0(dins18, 6107)
dins18 <- post0(dins18, 6107)
checkAPN(6107, dins18, attr18)

# 6111: No changes on DINs side; I make a change on Zillow side below.
inspectAPN(6111, dins18, attr18)

# 6113
inspectAPN(6113, dins18, attr18)
dins18 <- post0(dins18, 6113)
dins18 <- post0(dins18, 6113)
dins18 <- post0(dins18, 6113)
checkAPN(6113, dins18, attr18)

# 6115 - No changes needed
inspectAPN(6115, dins18, attr18)
checkAPN(6115, dins18, attr18)

########################## Changes on Zillow Side in ATTR18


# 6029
# APNs in this county carry 3 extra digitss in the FINAL positions compared to the DINS APNs. Trimming off
# these extra digits does not seem to sacrifice any information. The trimmed APNs no longer uniquely identify homes in Ztrax -- there are now 15ish duplicate records -- but
# importantly, none of these are homes that appear in DINS so this is still valid and usable as a merge ID.
inspectAPN(6029, dins18, attr18)
# confirm no loss of information when using this as a merge ID to DINS
g <- attr18[attr18$FIPS == 6029, c("AssessorParcelNumber", "UnformattedAssessorParcelNumber", "PropertyFullStreetAddress")]
g$trimmedapn <- substr(g$UnformattedAssessorParcelNumber, 1, 8)
gg <- g[duplicated(g$trimmedapn) | duplicated(g$trimmedapn, fromLast = TRUE), ]
gg[order(gg$trimmedapn), ]
sum(unique(gg$trimmedapn) %in% dins18$UnformattedAssessorParcelNumber[dins18$FIPS == 6029])
# Make the change in ZTRAX
attr18$UnformattedAssessorParcelNumber[attr18$FIPS == 6029] <-
  substr(attr18$UnformattedAssessorParcelNumber, 1, 8)[attr18$FIPS == 6029]
rm(g, gg)

# 6033: SAME ISSUE
inspectAPN(6033, dins18, attr18)
# confirm no loss of information when using this as a merge ID to DINS
g <- attr18[attr18$FIPS == 6033, c("AssessorParcelNumber", "UnformattedAssessorParcelNumber", "PropertyFullStreetAddress")]
g$trimmedapn <- substr(g$UnformattedAssessorParcelNumber, 1, 8)
gg <- g[duplicated(g$trimmedapn) | duplicated(g$trimmedapn, fromLast = TRUE), ]
gg[order(gg$trimmedapn), ]
sum(unique(gg$trimmedapn) %in% dins18$UnformattedAssessorParcelNumber[dins18$FIPS == 6033])
# Make the change in ZTRAX
attr18$UnformattedAssessorParcelNumber[attr18$FIPS == 6033] <-
  substr(attr18$UnformattedAssessorParcelNumber, 1, 8)[attr18$FIPS == 6033]
rm(g, gg)

# 6111
# APNs in this county carry 1 extra digit in the FINAL position compared to the DINS APNs (often but not always a '5'). Trimming off
# this extra digit does not seem to sacrifice any information. The trimmed APNs no longer uniquely identify homes in Ztrax -- there are now 30ish duplicate records -- but
# importantly, none of these are homes that appear in DINS so this is still valid and usable as a merge ID.
inspectAPN(6111, dins18, attr18)
# confirm no loss of information when using this as a merge ID to DINS
g <- attr18[attr18$FIPS == 6111, c("AssessorParcelNumber", "UnformattedAssessorParcelNumber", "PropertyFullStreetAddress")]
g$trimmedapn <- substr(g$UnformattedAssessorParcelNumber, 1, 9)
gg <- g[duplicated(g$trimmedapn) | duplicated(g$trimmedapn, fromLast = TRUE), ]
gg[order(gg$trimmedapn), ]
sum(unique(gg$trimmedapn) %in% dins18$UnformattedAssessorParcelNumber[dins18$FIPS == 6111])
stopifnot(sum(unique(gg$trimmedapn) %in% dins18$UnformattedAssessorParcelNumber[dins18$FIPS == 6111]) == 0) # Confirm that none of the duplicates after trimming APN in ZTRAX are records that appear in DINS
# Make the change in ZTRAX
attr18$UnformattedAssessorParcelNumber[attr18$FIPS == 6111] <-
  substr(attr18$UnformattedAssessorParcelNumber, 1, 9)[attr18$FIPS == 6111]
rm(g, gg)


# 6113
# THere is 1 parcel in this county that appears in dins18 AND is not uniquely identified in attr18 by FIPS,APN. I remove the 1 duplicate observation in this county.
### -- optional: show the duplicated observation
d <- merge(attr18, dins18[, .(FIPS, UnformattedAssessorParcelNumber)],
  by = c("FIPS", "UnformattedAssessorParcelNumber")
)
d[duplicated(d[, c("FIPS", "UnformattedAssessorParcelNumber")]) |
  duplicated(d[, c("FIPS", "UnformattedAssessorParcelNumber")], fromLast = TRUE), ]
### -- Remove it
w <- which(attr18$FIPS == 6113 & attr18$UnformattedAssessorParcelNumber == "030320008000" & attr18$BuildingOrImprovementNumber == 2)
attr18 <- attr18[-w, ]


#################### Drop county-incidents I noted to drop ############
dropfromdins17 <- c(6009, 6023, 6031, 6035, 6085, 6103, 6109)
dins17 <- dins17[!dins17$FIPS %in% dropfromdins17, ]
dropfromdins18 <- c(6017, 6069, 6085, 6103)
dins18 <- dins18[!dins18$FIPS %in% dropfromdins18, ]

# Limit ZTRAX data to just counties in DINS
attr16 <- attr16[FIPS %in% unique(dins17$FIPS), ]
attr18 <- attr18[FIPS %in% unique(dins18$FIPS), ]



write_fst(dins17, file.path(WORKING, "dins17_cleaned.fst"))
write_fst(dins18, file.path(WORKING, "dins18_cleaned.fst"))
write_fst(attr16, file.path(WORKING, "attr16_cleaned.fst"))
write_fst(attr18, file.path(WORKING, "attr18_cleaned.fst"))
